import os

# Dataset mixture configuration.
#
# `data_prefix` is a flat list that alternates a numeric weight with the
# dataset path it belongs to: [weight, path, weight, path, ...].
# The name is assigned three times inside this block; every assignment
# replaces the previous one, so only the LAST list is what the code after
# this block reads.  The first two lists are built and then discarded.
#
# `if True:` always executes; it merely groups the configuration under one
# indented block.
if True:
    # ---- Mixture 1 (overwritten further down) ----
    batch1_tok100k = '/data2/indexed_dataset/batch1_batch2_tok100k/batch1_tok100k'
    batch2_tok100k = '/data2/indexed_dataset/batch1_batch2_tok100k/batch2_tok100k'
    # The weights in this list add up to ~100, i.e. they read as percentages.
    data_prefix = [
        2.242990654,
        os.path.join(batch1_tok100k, 'en_dedup-md5-pile-openwebtext2_text_document'),
        5.046728972,
        os.path.join(batch1_tok100k, 'en_dedup-md5-pile-pile-cc_text_document'),
        13.64485981,
        os.path.join(batch1_tok100k, 'wudao-9_text_document'),
        2.336448598,
        os.path.join(batch1_tok100k, 'code_dedup-md5-pile-github_text_document'),
        1.869158879,
        os.path.join(batch1_tok100k, 'codegeex_text_document'),
        1.588785047,
        os.path.join(batch1_tok100k, 'en_dedup-md5-pile-wikipedia_en_text_document'),
        2.336448598,
        os.path.join(batch1_tok100k, 'cn_baike_text_document'),
        4.205607477,
        os.path.join(batch1_tok100k, 'pile-books_text_document'),
        0.186915888,
        os.path.join(batch1_tok100k, 'cn_ebook_merge_maxlen_text_document'),
        2.429906542,
        os.path.join(batch1_tok100k, 'pile-papers_text_document'),
        1.869158879,
        os.path.join(batch1_tok100k, 'en_dedup-md5-pile-stackexchange_text_document'),
        0.747663551,
        os.path.join(batch1_tok100k, 'cn_zhihu_text_document'),

        # Entries below live under the batch2 directory.
        31.77570093,
        os.path.join(batch2_tok100k, 'ccnews_text_document'),
        12.42990654,
         os.path.join(batch2_tok100k, 'c4_text_document'),
        11.58878505,
        os.path.join(batch2_tok100k, 'wudao-3-8_text_document'),
        1.869158879,
        os.path.join(batch2_tok100k, 'hf-wiki_text_document'),
        0.654205607,
        os.path.join(batch2_tok100k, 'sjt_text_document'),
        1.214953271,
        os.path.join(batch2_tok100k, 'col_text_document'),
        1.121495327,
        os.path.join(batch2_tok100k, 'byg-cn_text_document'),
        0.093457944,
        os.path.join(batch2_tok100k, 'qa_text_document'),
        0.747663551,
        os.path.join(batch2_tok100k, 'wenge-zhihu-high_text_document'),
    ]

    # ---- Mixture 2 (replaces mixture 1; itself overwritten further down) ----
    batch6_tok100k = '/data/binary_data'
    # NOTE(review): 'cn_baike_text_document' is listed twice in this list
    # (weights 0.05 and 0.085) and the weights add up to 1.05 rather than 1 --
    # confirm whether one of the two entries is a leftover.
    data_prefix = [
        0.05,
        os.path.join(batch6_tok100k, 'cn_baike_text_document'),
        0.075,
        os.path.join(batch6_tok100k, 'cn_book_paper_text_document'),
        0.1,
        os.path.join(batch6_tok100k, 'code_v02_train_text_document'),
        0.05,
        os.path.join(batch6_tok100k, 'en_dedup-md5-pile-stackexchange_text_document'),
        0.05,
        os.path.join(batch6_tok100k, 'en_dedup-md5-pile-wikipedia_en_text_document'),
        0.3,
        os.path.join(batch6_tok100k, 'falcon_batch05_text_document'),
        0.015,
        os.path.join(batch6_tok100k, 'kbqa_text_document'),
        0.0475,
        os.path.join(batch6_tok100k, 'mnbvc_text_document'),
        0.015,
        os.path.join(batch6_tok100k, 'newspaper_text_document'),
        0.1,
        os.path.join(batch6_tok100k, 'red-arxiv_text_document'),
        0.01,
        os.path.join(batch6_tok100k, 'wg_batch2_zhihu_text_document'),
        0.0025,
        os.path.join(batch6_tok100k, 'wg_toutiao_content300_up100_text_document'),
        0.05,
        os.path.join(batch6_tok100k, 'wudao-9_text_document'),
        0.085,
        os.path.join(batch6_tok100k, 'cn_baike_text_document'),
        0.1,
        os.path.join(batch6_tok100k, 'red-book_text_document'),
    ]

    # ---- Mixture 3 (the final assignment; this is the list used below) ----
    # batch6_tok100k and batch1_tok100k are rebound to new directories here.
    # The lists above are unaffected: their paths were already joined into
    # plain strings when those lists were built.
    batch6_tok100k = '/data/20230702'
    batch5_tok100k = '/data/20230704'
    batch4_tok100k = '/data2/indexed_dataset/20230614'
    batch3_tok100k = '/data2/indexed_dataset/20230605'
    batch1_tok100k = '/data2/indexed_dataset/230530'
    code_tok100k = '/data2/indexed_dataset/230513'
    data_prefix = [
        0.000029445,
        os.path.join(batch1_tok100k, 'clue_mmlu_text_document'),
        0.000072158,
        os.path.join(batch1_tok100k, 'coig-part_text_document'),
        0.001963836,
        os.path.join(batch1_tok100k, 'oig-part_text_document'),
        0.000073886,
        os.path.join(batch1_tok100k, 'jiaocai-clean_text_document'),
        0.007255237,
        os.path.join(batch1_tok100k, 'paper_full_processed_text_document'),
        0.045495293,
        os.path.join(batch1_tok100k, 'wg_batch2_zhihu_text_document'),
        0.007486616,
        os.path.join(batch1_tok100k, 'newspaper_text_document'),
        0.039105132,
        os.path.join(batch1_tok100k, 'csdn_text_document'),
        0.001311223,
        os.path.join(batch1_tok100k, 'wg_toutiao_content300_up100_text_document'),
        0.010550114,
        os.path.join(batch1_tok100k, 'en_dedup-md5-pile-wikipedia_en_text_document'),
        0.0528199,
        os.path.join(batch1_tok100k, 'cn_baike_text_document'),
        0.076955788,
        os.path.join(batch1_tok100k, 'pile-books_text_document'),
        0.054428893,
        os.path.join(batch1_tok100k, 'pile-papers_text_document'),
        0.000263018,
        os.path.join(batch1_tok100k, 'paper_abstract_processed_text_document'),
        0.00681505,
        os.path.join(code_tok100k, 'github-issues-filtered-structured_text_document'),
        0.069890943,
        os.path.join(batch3_tok100k, 'red-arxiv_text_document'),
        0.049982051,
        os.path.join(batch3_tok100k, 'red-stack_text_document'),
        0.00010298,
        os.path.join(batch3_tok100k, 'medqa_cn_text_document'),
        0.000390966,
        os.path.join(batch3_tok100k, 'medqa_en_text_document'),
        0.000008794,
        os.path.join(batch4_tok100k, 'norm-competition-math_text_document'),
        0.00001184,
        os.path.join(batch4_tok100k, 'norm-prm800k_text_document'),
        0.000073169,
        os.path.join(batch4_tok100k, 'norm-schoolmath_text_document'),
        0.030593583,
        os.path.join(batch4_tok100k, 'norm-tiger-zh_text_document'),
        0.000026382,
        os.path.join(batch4_tok100k, 'norm-math_text_document'),
        0.000202747,
        os.path.join(batch4_tok100k, 'norm-mathamps-khan_text_document'),
        0.001981625,
        os.path.join(batch4_tok100k, 'norm-mathamps-mathematica_text_document'),
        0.30118148,
        os.path.join(batch6_tok100k, 'falcon_batch2_text_document'),
        0.150872077,
        os.path.join(batch6_tok100k, 'wudao_norm_1_text_document'),
        0.013207644,
        os.path.join(code_tok100k, 'c_text_document'),
        0.010886165,
        os.path.join(code_tok100k, 'cpp_text_document'),
        0.006266925,
        os.path.join(code_tok100k, 'git-commits-cleaned_text_document'),
        0.005772797,
        os.path.join(code_tok100k, 'go_text_document'),
        0.016068358,
        os.path.join(code_tok100k, 'java_text_document'),
        0.012757458,
        os.path.join(code_tok100k, 'javascript_text_document'),
        0.001980534,
        os.path.join(code_tok100k, 'jupyter-structured-clean-dedup_text_document'),
        0.002347849,
        os.path.join(code_tok100k, 'jupyter-scripts-dedup-filtered_text_document'),
        0.016954185,
        os.path.join(code_tok100k, 'python_text_document'),
        0.000754847,
        os.path.join(code_tok100k, 'shell_text_document'),
        0.003059014,
        os.path.join(code_tok100k, 'sql_text_document'),
        0.000059718,
        os.path.join(batch5_tok100k, 'ccmatrix-zh-en_text_document'),
        0.000705214,
        os.path.join(batch5_tok100k, 'tsl2019_text_document'),
        0.001058247,
        os.path.join(batch5_tok100k, 'csl_text_document'),
        0.001156925,
        # NOTE(review): this entry uses batch6_tok100k while its neighbours use
        # batch5_tok100k -- confirm the directory is intended.
        os.path.join(batch6_tok100k, 'kg-zy_text_document'),
        0.00001687,
        os.path.join(batch5_tok100k, 'mnbvc-gov_text_document'),
        0.001156909,
        os.path.join(batch5_tok100k, 'mnbvc-qa_text_document'),
        0.000315041,
        os.path.join(batch5_tok100k, 'mnbvc-qaen_text_document'),
        0.011401359,
        os.path.join(batch5_tok100k, 'mnbvc-wiki_text_document'),
        0.000877942,
        os.path.join(batch5_tok100k, 'norm-baikeqa_text_document'),
        0.000397255,
        os.path.join(batch5_tok100k, 'mnbvc-xxqg_text_document'),
        0.001670916,
        os.path.join(batch5_tok100k, 'mnbvc-news_text_document'),
        0.008678739,
        os.path.join(batch5_tok100k, 'mnbvc-co-ann-report_text_document'),
    ]

# Print shell variable assignments describing the mixture in `data_prefix`.
#
# `data_prefix` alternates weight, path, weight, path, ...  Each pair gets a
# numbered line `DATASET_<k>="<path>"` (k starts at 1).  The closing
# `DATASET="..."` line lists, for every pair, the weight formatted with nine
# decimal places followed by a `${DATASET_<k>}` reference, space-separated.
weights = data_prefix[0::2]  # even positions hold the weights
paths = data_prefix[1::2]    # odd positions hold the matching paths

mixture_terms = []
# zip() stops at the shorter slice, so a trailing weight without a path is
# ignored.
for index, (weight, path) in enumerate(zip(weights, paths), start=1):
    var_name = f"DATASET_{index}"
    print(f'{var_name}="{path}"')
    mixture_terms.append("%0.9f ${%s}" % (weight, var_name))

dataset = " ".join(mixture_terms)
print(f'DATASET="{dataset}"')
